<!DOCTYPE html>



  


<html class="theme-next gemini use-motion" lang="zh-Hans">
<head>
  <meta charset="UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<!-- No maximum-scale here: capping the zoom level stops users from pinch-zooming the page. -->
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="theme-color" content="#222">









<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="Cache-Control" content="no-siteapp" />
















  
  
  <link href="/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" type="text/css" />







<link href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css" />

<link href="/css/main.css?v=5.1.4" rel="stylesheet" type="text/css" />


  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png?v=5.1.4">


  <link rel="mask-icon" href="/images/logo.svg?v=5.1.4" color="#222">





  <!-- Comma-separated keyword list; a trailing comma would leave an empty item at the end. -->
  <meta name="keywords" content="Spider基础" />










<meta name="description" content="Spider学习笔记前言:​    网络爬虫（Web Spider。又被称为网页蜘蛛。网络机器人，又称为网页追逐者），是一种依照一定的规则，自己主动的抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁，自己主动索引。模拟程序或者蠕虫。假设把互联网比喻成一个蜘蛛网，那么Spider就是在网上爬来爬去的蜘蛛。 ​    网络蜘蛛是通过网页的链接地址来寻找网页的。从站点某一个页面（一般是首页）">
<meta name="keywords" content="Spider基础">
<meta property="og:type" content="article">
<meta property="og:title" content="Spider(蜘蛛)笔记">
<meta property="og:url" content="http://yoursite.com/2018/06/30/Spider-蜘蛛-笔记/index.html">
<meta property="og:site_name" content="Mr. Lee&#39;s blog">
<meta property="og:description" content="Spider学习笔记前言:​    网络爬虫（Web Spider。又被称为网页蜘蛛。网络机器人，又称为网页追逐者），是一种依照一定的规则，自己主动的抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁，自己主动索引。模拟程序或者蠕虫。假设把互联网比喻成一个蜘蛛网，那么Spider就是在网上爬来爬去的蜘蛛。 ​    网络蜘蛛是通过网页的链接地址来寻找网页的。从站点某一个页面（一般是首页）">
<meta property="og:locale" content="zh-Hans">
<meta property="og:updated_time" content="2018-06-30T03:25:27.000Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Spider(蜘蛛)笔记">
<meta name="twitter:description" content="Spider学习笔记前言:​    网络爬虫（Web Spider。又被称为网页蜘蛛。网络机器人，又称为网页追逐者），是一种依照一定的规则，自己主动的抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁，自己主动索引。模拟程序或者蠕虫。假设把互联网比喻成一个蜘蛛网，那么Spider就是在网上爬来爬去的蜘蛛。 ​    网络蜘蛛是通过网页的链接地址来寻找网页的。从站点某一个页面（一般是首页）">



<!--
  Inline configuration script (id "hexo.configurations").
  Declares two globals:
    NexT   : reuses window.NexT when it already exists, otherwise an empty object.
    CONFIG : an object literal with root, scheme, version, sidebar, fancybox, tabs,
             motion, duoshuo and algolia settings.
  NOTE(review): presumably consumed by theme scripts loaded later in the page; confirm.
  NOTE(review): algolia applicationID, apiKey and indexName are empty strings here.
-->
<script type="text/javascript" id="hexo.configurations">
  var NexT = window.NexT || {};
  var CONFIG = {
    root: '/',
    scheme: 'Gemini',
    version: '5.1.4',
    sidebar: {"position":"left","display":"post","offset":12,"b2t":false,"scrollpercent":false,"onmobile":false},
    fancybox: true,
    tabs: true,
    motion: {"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
    duoshuo: {
      userId: '0',
      author: 'Author'
    },
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: {"per_page":10},
      labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
    }
  };
</script>



  <link rel="canonical" href="http://yoursite.com/2018/06/30/Spider-蜘蛛-笔记/"/>





  <title>Spider(蜘蛛)笔记 | Mr. Lee's blog</title>
  








</head>

<body itemscope itemtype="http://schema.org/WebPage" lang="zh-Hans">

  
  
    
  

  <div class="container sidebar-position-left page-post-detail">
    <div class="headband"></div>

    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta ">
    

    <div class="custom-logo-site-title">
      <a href="/"  class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">Mr. Lee's blog</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
      
        <h1 class="site-subtitle" itemprop="description">知识改变命运,技术成就梦想</h1>
      
  </div>

  <div class="site-nav-toggle">
    <!-- Icon-only control: explicit type, plus an accessible name for assistive technology. -->
    <button type="button" aria-label="切换导航栏">
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>

<nav class="site-nav">
  

  
    <ul id="menu" class="menu">
      
        
        <li class="menu-item menu-item-home">
          <a href="/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-home"></i> <br />
            
            首页
          </a>
        </li>
      
        
        <li class="menu-item menu-item-archives">
          <a href="/archives/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-archive"></i> <br />
            
            归档
          </a>
        </li>
      
        
        <li class="menu-item menu-item-tags">
          <a href="/tags/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-tags"></i> <br />
            
            标签
          </a>
        </li>
      
        
        <li class="menu-item menu-item-about">
          <a href="/about/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-user"></i> <br />
            
            关于
          </a>
        </li>
      
        
        <li class="menu-item menu-item-commonweal">
          <a href="/404.html" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-heartbeat"></i> <br />
            
            公益404
          </a>
        </li>
      

      
        <li class="menu-item menu-item-search">
          
            <a href="javascript:;" class="popup-trigger">
          
            
              <i class="menu-item-icon fa fa-search fa-fw"></i> <br />
            
            搜索
          </a>
        </li>
      
    </ul>
  

  
    <div class="site-search">
      
  <div class="popup search-popup local-search-popup">
  <div class="local-search-header clearfix">
    <span class="search-icon">
      <i class="fa fa-search"></i>
    </span>
    <span class="popup-btn-close">
      <i class="fa fa-times-circle"></i>
    </span>
    <div class="local-search-input-wrapper">
      <!-- A placeholder is not a label; aria-label gives the field an accessible name. -->
      <input autocomplete="off"
             placeholder="搜索..." spellcheck="false"
             type="text" id="local-search-input"
             aria-label="搜索">
    </div>
  </div>
  <div id="local-search-result"></div>
</div>



    </div>
  
</nav>



 </div>
    </header>

    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  <article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="http://yoursite.com/2018/06/30/Spider-蜘蛛-笔记/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content="Mr. Lee">
      <meta itemprop="description" content="">
      <meta itemprop="image" content="/avatar.png">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Mr. Lee's blog">
    </span>

    
      <header class="post-header">

        
        
          <h2 class="post-title" itemprop="name headline">Spider(蜘蛛)笔记</h2>
        

        <div class="post-meta">
          <span class="post-time">
            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">发表于</span>
              
              <time title="创建于" itemprop="dateCreated datePublished" datetime="2018-06-30T11:23:30+08:00">
                2018-06-30
              </time>
            

            

            
          </span>

          

          
            
          

          
          

          

          
            <div class="post-wordcount">
              
                
                <span class="post-meta-item-icon">
                  <i class="fa fa-file-word-o"></i>
                </span>
                
                  <span class="post-meta-item-text">字数统计&#58;</span>
                
                <span title="字数统计">
                  9,003
                </span>
              

              
                <span class="post-meta-divider">|</span>
              

              
                <span class="post-meta-item-icon">
                  <i class="fa fa-clock-o"></i>
                </span>
                
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                
                <span title="阅读时长">
                  38
                </span>
              
            </div>
          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <h4 id="Spider学习笔记"><a href="#Spider学习笔记" class="headerlink" title="Spider学习笔记"></a>Spider学习笔记</h4><h5 id="前言"><a href="#前言" class="headerlink" title="前言:"></a>前言:</h5><p>​    网络爬虫（Web Spider。又被称为网页蜘蛛。网络机器人，又称为网页追逐者），是一种依照一定的规则，自己主动的抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁，自己主动索引。模拟程序或者蠕虫。假设把互联网比喻成一个蜘蛛网，那么Spider就是在网上爬来爬去的蜘蛛。</p>
<p>​    网络蜘蛛是通过网页的链接地址来寻找网页的。从站点某一个页面（一般是首页）开始，读取网页的内容，找到在网页中的其他链接地址，然后通过这些链接地址寻找下一个网页，这样一直循环下去，直到把这个站点全部的网页都抓取完为止。假设把整个互联网当成一个站点，那么网络蜘蛛就能够用这个原理把互联网上全部的网页都抓取下来。这样看来，网络爬虫就是一个爬行程序，一个抓取网页的程序。</p>
<p><strong>简单地说，网络爬虫的基本任务就是抓取网页内容。</strong></p>
<h5 id="1-数据分析和采集"><a href="#1-数据分析和采集" class="headerlink" title="1. 数据分析和采集"></a>1. 数据分析和采集</h5><p>本爬虫教程中使用的python版本统一为python3.X的版本</p>
<h6 id="1-1-数据分析"><a href="#1-1-数据分析" class="headerlink" title="1.1 数据分析"></a>1.1 数据分析</h6><p>爬取网页信息可以使用很多的技术：</p>
<ol>
<li><p>获取网页信息：urllib、urllib3、requests</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">requests为第三方的库，需要安装才能使用</span><br><span class="line"></span><br><span class="line">pip install requests</span><br></pre></td></tr></table></figure>
</li>
<li><p>解析网页信息：beautifulsoup4(bs4)、re、xpath、lxml</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">bs4为第三方的库，需要安装才能使用</span><br><span class="line"></span><br><span class="line">pip install beautifulsoup4</span><br><span class="line"></span><br><span class="line">使用的时候 from bs4 import BeautifulSoup 这样导入</span><br></pre></td></tr></table></figure>
</li>
</ol>
<p>Python 标准库中自带了 xml 模块，但是性能不够好，而且缺乏一些人性化的 API，相比之下，第三方库 lxml 是用 Cython 实现的，而且增加了很多实用的功能。</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">安装lxml，在新版本中无法使用from lxml import etree</span><br><span class="line"> </span><br><span class="line"> pip install lxml 并不推荐这样去安装lxml</span><br><span class="line"></span><br><span class="line"> 推荐安装的方法：访问网站(https://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml)下载lxml的安装whl文件，然后进行安装。</span><br></pre></td></tr></table></figure>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">pip install lxml-4.2.1-cp36-cp36m-win_amd64.whl</span><br></pre></td></tr></table></figure>
<ol start="3">
<li><p>动态数据解析</p>
<p>通用：selenium(自动化测试框架)</p>
</li>
</ol>
<h6 id="1-2-数据采集"><a href="#1-2-数据采集" class="headerlink" title="1.2 数据采集"></a>1.2 数据采集</h6><ol>
<li><p>存储：mysql、redis、mongodb、sqlalchemy</p>
</li>
<li><p>序列化：json</p>
</li>
<li><p>调度器：进程、线程、协程</p>
<p>​</p>
</li>
</ol>
<h5 id="2-请求头分析"><a href="#2-请求头分析" class="headerlink" title="2. 请求头分析"></a>2. 请求头分析</h5><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br></pre></td><td class="code"><pre><span class="line"># 浏览器告诉服务器可以接收的文本类型, */*表示任何类型都可以接收</span><br><span class="line">Accept: text/html, */*;q=0.8</span><br><span class="line"></span><br><span class="line"># 浏览器告诉服务器，数据可以压缩，页面可以解压数据然后进行渲染。做爬虫的时候，最好不要写该参数</span><br><span class="line">Accept-Encoding: gzip, deflate </span><br><span class="line"></span><br><span class="line"># 语言类型</span><br><span class="line">Accept-Language: zh-CN,zh;q=0.9 </span><br><span class="line"></span><br><span class="line">Cache-Control: max-age=0</span><br><span class="line"></span><br><span class="line"># 保持连接</span><br><span class="line">Connection: keep-alive </span><br><span class="line"></span><br><span class="line"># 会话 </span><br><span class="line">Cookie: Hm_lvt_3bfcc098e0da26d58c321ba579b04b2f=1527581188,1528137133</span><br><span class="line"></span><br><span class="line"># 域名</span><br><span class="line">Host: www.cdtopspeed.com </span><br><span class="line"></span><br><span class="line">Upgrade-Insecure-Requests: 1</span><br><span class="line"></span><br><span class="line"># 用户代理, 
使得服务器能够识别请求是通过浏览器请求过来的，其中包含浏览器的名称/版本等信息</span><br><span class="line">User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36</span><br></pre></td></tr></table></figure>
<p>其中在爬虫中最重要的就是User-Agent：在下面urllib的使用中就会详细的解释User-Agent的使用</p>
<h5 id="3-urllib库的使用"><a href="#3-urllib库的使用" class="headerlink" title="3.urllib库的使用"></a>3.urllib库的使用</h5><p><code>urllib</code>是Python自带的标准库，无需安装，直接可以用。</p>
<p>提供了如下功能：</p>
<ul>
<li>网页请求   </li>
<li>响应获取</li>
<li>代理和cookie设置</li>
<li>异常处理</li>
<li>URL解析</li>
</ul>
<blockquote>
<p>爬虫所需要的功能，基本上在<code>urllib</code>中都能找到，学习这个标准库，可以更加深入的理解后面更加便利的<code>requests</code>库。</p>
</blockquote>
<h6 id="3-1-发起请求"><a href="#3-1-发起请求" class="headerlink" title="3.1 发起请求"></a>3.1 发起请求</h6><p>模拟浏览器发起一个 HTTP 请求，我们需要用到 urllib.request 模块。urllib.request 的作用不仅仅是发起请求， 还能获取请求返回结果。发起请求，单靠 <code>urlopen()</code> 方法就可以叱咤风云。我们先看下 urlopen() 的 API</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">urllib.request.urlopen(url, data=<span class="keyword">None</span>, [timeout, ]*, cafile=<span class="keyword">None</span>, capath=<span class="keyword">None</span>, cadefault=<span class="keyword">False</span>, context=<span class="keyword">None</span>)</span><br></pre></td></tr></table></figure>
<blockquote>
<ol>
<li>第一个参数String 类型的地址</li>
<li><code>data</code>是bytes类型的内容,可以通过bytes()函数转化为字节流,它也是可选参数.使用data参数时,请求方式变成以POST方式提交表单.使用的标准格式是<code>application/x-www-form-urlencoded</code></li>
<li><code>timeout</code> 参数是用于设置请求超时时间,单位是秒.</li>
<li><code>cafile</code>和<code>capath</code>代表CA证书和CA证书的路径.如果使用HTTPS则需要用到.</li>
<li><code>context</code>参数是<code>ssl.SSLContext</code>类型,用来指定SSL设置</li>
<li><code>cadefault</code>参数已经被弃用.</li>
<li>该方法也可以单独传入<code>urllib.request.Request</code>对象</li>
<li>该函数返回结果是一个<code>http.client.HTTPResponse</code>对象</li>
</ol>
</blockquote>
<h6 id="3-2-简单抓取网页"><a href="#3-2-简单抓取网页" class="headerlink" title="3.2 简单抓取网页"></a>3.2 简单抓取网页</h6><p>我们去获取百度首页的源代码</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> urllib.request</span><br><span class="line"></span><br><span class="line">url = <span class="string">"http://www.baidu.com"</span></span><br><span class="line">response = urllib.request.urlopen(url)</span><br><span class="line">html = response.read()         <span class="comment"># 获取到页面的源代码</span></span><br><span class="line">print(html.decode(<span class="string">'utf-8'</span>))    <span class="comment"># 转化为 utf-8 编码</span></span><br></pre></td></tr></table></figure>
<h6 id="3-2-设置请求超时"><a href="#3-2-设置请求超时" class="headerlink" title="3.2 设置请求超时"></a>3.2 设置请求超时</h6><p>有些请求可能因为网络原因无法得到响应。因此，我们可以手动设置超时时间。当请求超时，我们可以采取进一步措施，例如选择直接丢弃该请求或者再请求一次。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> urllib.request</span><br><span class="line"></span><br><span class="line">url = <span class="string">"http://www.baidu.com"</span></span><br><span class="line">response = urllib.request.urlopen(url, timeout=<span class="number">1</span>)</span><br><span class="line">print(response.read().decode(<span class="string">'utf-8'</span>))</span><br></pre></td></tr></table></figure>
<h6 id="3-3-ssl认证"><a href="#3-3-ssl认证" class="headerlink" title="3.3 ssl认证"></a>3.3 ssl认证</h6><p>什么是 SSL 证书？</p>
<p>SSL 证书就是遵守 SSL 安全套接层协议的服务器数字证书。</p>
<p>而 SSL 安全协议最初是由美国网景 Netscape Communication 公司设计开发的，全称为：安全套接层协议 (Secure Sockets Layer) ， 它指定了在应用程序协议 ( 如 HTTP 、 Telnet 、 FTP) 和 TCP/IP 之间提供数据安全性分层的机制，它是在传输通信协议 (TCP/IP) 上实现的一种安全协议，采用公开密钥技术，它为 TCP/IP 连接提供数据加密、服务器认证、消息完整性以及可选的客户机认证。由于此协议很好地解决了互联网明文传输的不安全问题，很快得到了业界的支持，并已经成为国际标准。</p>
<p>SSL 证书由浏览器中“受信任的根证书颁发机构”在验证服务器身份后颁发，具有网站身份验证和加密传输双重功能。</p>
<p>如果能使用 https:// 来访问某个网站，就表示此网站是部署了SSL证书。一般来讲，如果此网站部署了SSL证书，则在需要加密的页面会自动从 http:// 变为 https:// ，如果没有变，你认为此页面应该加密，您也可以尝试直接手动在浏览器地址栏的http后面加上一个英文字母“ s ”后回车，如果能正常访问并出现安全锁，则表明此网站实际上是部署了SSL证书，只是此页面没有做 https:// 链接；如果不能访问，则表明此网站没有部署 SSL证书。</p>
<p>有些时候我们在请求的时候会出现如下这样的错误信息.</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">urllib.error.URLError: &lt;urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:833)&gt;</span><br></pre></td></tr></table></figure>
<p>如果不忽略ssl的安全认证的话，网页的源码会提示ssl认证问题，需要提供ssl认证。我们在做爬虫的时候，自动设置忽略掉ssl认证即可。 如下案例.在京东首页搜索某个商品能出现多少结果.</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> re</span><br><span class="line"><span class="keyword">import</span> urllib.request</span><br><span class="line"></span><br><span class="line"><span class="comment"># 使用urllib进行中文的编码和解码</span></span><br><span class="line"><span class="keyword">from</span> urllib <span class="keyword">import</span> parse</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> ssl</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">main</span><span class="params">(url)</span>:</span></span><br><span class="line">    <span class="comment"># 请求头</span></span><br><span class="line">    header = 
&#123;</span><br><span class="line">        <span class="string">'User-Agent'</span>: <span class="string">'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'</span></span><br><span class="line">    &#125;</span><br><span class="line">    <span class="comment"># 设置忽略ssl认证</span></span><br><span class="line">    context = ssl._create_unverified_context()</span><br><span class="line">    <span class="comment"># 发起请求</span></span><br><span class="line">    req = urllib.request.Request(url, headers=header)</span><br><span class="line">    <span class="comment"># 得到响应数据</span></span><br><span class="line">    res = urllib.request.urlopen(req, context=context)</span><br><span class="line">	<span class="comment"># 使用正则匹配需要的数据</span></span><br><span class="line">    conent_re = re.findall(<span class="string">'page_count:"(\d+)"'</span>, res.read().decode(<span class="string">'utf-8'</span>))</span><br><span class="line">    </span><br><span class="line">    print(<span class="string">'共有'</span> + conent_re[<span class="number">0</span>] + <span class="string">'页数据'</span>)</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">"__main__"</span>:</span><br><span class="line">    </span><br><span class="line">    msg = input(<span class="string">'请输入搜索信息:'</span>)</span><br><span class="line">    </span><br><span class="line">    <span class="comment"># 使用urllib进行中文的编码和解码</span></span><br><span class="line">    search = parse.urlencode(&#123;<span class="string">'keyword'</span>: msg&#125;)</span><br><span class="line">    </span><br><span class="line">    url = <span class="string">'https://search.jd.com/Search?%s'</span> % search</span><br><span class="line">    main(url)</span><br></pre></td></tr></table></figure>
<h5 id="4-使用urllib进行中文的编码和解码"><a href="#4-使用urllib进行中文的编码和解码" class="headerlink" title="4. 使用urllib进行中文的编码和解码"></a>4. 使用urllib进行中文的编码和解码</h5><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line">from urllib import parse</span><br><span class="line"></span><br><span class="line"># 编码</span><br><span class="line">enstr = parse.urlencode(&#123;&apos;kd&apos;: &apos;忠林&apos;&#125;)</span><br><span class="line"># 打印的结果为 kd=%E5%BF%A0%E6%9E%97</span><br><span class="line">print(enstr)</span><br><span class="line"></span><br><span class="line"># 解码</span><br><span class="line">destr = parse.unquote(enstr)</span><br><span class="line"># 解码的结果为 kd=忠林</span><br><span class="line">print(destr)</span><br></pre></td></tr></table></figure>
<p>案例1,爬取格言网中的<code>100句关于梦想的名言警句</code></p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> re</span><br><span class="line"><span class="keyword">import</span> urllib.request</span><br><span class="line"><span class="keyword">import</span> ssl</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_motto</span><span class="params">()</span>:</span></span><br><span class="line">    header = &#123;</span><br><span class="line">        <span class="string">'User-Agent'</span>: <span class="string">'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'</span></span><br><span class="line">    &#125;</span><br><span class="line">    <span class="comment"># 
请求的url</span></span><br><span class="line">    url = <span class="string">'https://www.geyanw.com/mingyanjingju/1857.html'</span></span><br><span class="line">    </span><br><span class="line">    context = ssl._create_unverified_context()</span><br><span class="line">    </span><br><span class="line">    req = urllib.request.Request(url, headers=header)</span><br><span class="line">    </span><br><span class="line">    res = urllib.request.urlopen(req, context=context)</span><br><span class="line">    </span><br><span class="line">    <span class="comment"># 正则匹配</span></span><br><span class="line">    pattern = re.compile(<span class="string">'&lt;p&gt;(.*?)&lt;/p&gt;'</span>)</span><br><span class="line"></span><br><span class="line">    content_result = re.findall(pattern, res.read().decode(<span class="string">'gbk'</span>))</span><br><span class="line">	</span><br><span class="line">    <span class="comment"># 写入文件中去</span></span><br><span class="line">    <span class="keyword">with</span> open(<span class="string">'12.txt'</span>, <span class="string">'w+'</span>, encoding=<span class="string">'utf-8'</span>) <span class="keyword">as</span> f:</span><br><span class="line">        <span class="keyword">for</span> i <span class="keyword">in</span> content_result:</span><br><span class="line">            <span class="keyword">if</span> i != <span class="string">'&amp;nbsp;'</span>:</span><br><span class="line">                f.writelines(i + <span class="string">'\n'</span>)</span><br><span class="line">        <span class="comment"># 关闭文件对象       </span></span><br><span class="line">        f.close()</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">"__main__"</span>:</span><br><span class="line">    get_motto()</span><br></pre></td></tr></table></figure>
<h5 id="5-数据采集"><a href="#5-数据采集" class="headerlink" title="5. 数据采集"></a>5. 数据采集</h5><p>数据采集，针对网页获取源码，按照一定的正则匹配，或者xpath的规则去匹配出我们需要的结果，进行分类筛选入库等操作。在本章中会讲到requests，beautifulsoup等工具去爬取网页，获取相关需要的信息。</p>
<h6 id="5-1-BeautifSoup库的使用"><a href="#5-1-BeautifSoup库的使用" class="headerlink" title="5.1 BeautifulSoup库的使用"></a>5.1 BeautifulSoup库的使用</h6><p>Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库.它能够通过你喜欢的转换器实现惯用的文档导航,查找,修改文档的方式.Beautiful Soup会帮你节省数小时甚至数天的工作时间.—–引入<a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html" target="_blank" rel="noopener">官网地址</a>的一句话</p>
<p>Beautiful Soup 4 通过PyPi发布,所以如果你无法使用系统包管理安装,那么也可以通过 easy_install 或 pip 来安装.包的名字是 beautifulsoup4 ,这个包兼容Python2和Python3.</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">pip install beautifulsoup4</span><br></pre></td></tr></table></figure>
<h6 id="5-2-解析语法、find、find-all"><a href="#5-2-解析语法、find、find-all" class="headerlink" title="5.2 解析语法、find、find_all"></a>5.2 解析语法、find、find_all</h6><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">find_all( name , attrs , recursive , text , **kwargs )</span><br></pre></td></tr></table></figure>
<p>find_all() 方法搜索当前tag的所有tag子节点,并判断是否符合过滤器的条件</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line">1. 查询所有a标签的内容</span><br><span class="line"></span><br><span class="line">	soup.find_all(&apos;a&apos;)</span><br><span class="line"></span><br><span class="line">2. 查询所有a标签下class样式为bb的内容</span><br><span class="line"></span><br><span class="line">	soup.find_all(&apos;a&apos;, &apos;bb&apos;)</span><br><span class="line"></span><br><span class="line">3. 查询所有id样式为cc的内容</span><br><span class="line"></span><br><span class="line">	soup.find_all(id=&apos;cc&apos;)</span><br></pre></td></tr></table></figure>
<p>案例: 爬取知乎发现里面的问答</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"><span class="keyword">from</span> bs4 <span class="keyword">import</span> BeautifulSoup</span><br><span class="line"><span class="keyword">import</span> pymysql</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> 
<span class="title">get_mysql</span><span class="params">(sql, params_list)</span>:</span></span><br><span class="line">    <span class="comment"># 建立连接</span></span><br><span class="line">    conn = pymysql.connect(port=<span class="number">3306</span>, host=<span class="string">'localhost'</span>, password=<span class="string">'123456'</span>, </span><br><span class="line">                           charset=<span class="string">'utf8'</span>, user=<span class="string">'root'</span>,database=<span class="string">'spider'</span>)</span><br><span class="line">    <span class="comment"># 创建游标对象</span></span><br><span class="line">    cursor = conn.cursor()</span><br><span class="line">    <span class="comment"># 添加数据</span></span><br><span class="line">    cursor.executemany(sql, params_list)</span><br><span class="line">    <span class="comment"># 提交</span></span><br><span class="line">    conn.commit()</span><br><span class="line">    <span class="comment"># 关闭游标连接</span></span><br><span class="line">    conn.close()</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">start_crawl</span><span class="params">(url)</span>:</span></span><br><span class="line">    headers = &#123;</span><br><span class="line">        <span class="string">'User-Agent'</span>: <span class="string">'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'</span></span><br><span class="line">    &#125;</span><br><span class="line">    res = requests.get(url, headers=headers)</span><br><span class="line">    </span><br><span class="line">    soup = BeautifulSoup(res.text, <span class="string">'lxml'</span>)</span><br><span class="line">    <span class="comment"># 筛选类为question_link 的a标签</span></span><br><span class="line">    a_links = soup.find_all(<span class="string">'a'</span>, <span 
class="string">'question_link'</span>)</span><br><span class="line"></span><br><span class="line">    result_list = []</span><br><span class="line"></span><br><span class="line">    <span class="keyword">for</span> link <span class="keyword">in</span> a_links:</span><br><span class="line">        <span class="comment"># 获取问题的连接</span></span><br><span class="line">        answer_link = <span class="string">'https://www.zhihu.com'</span> + link.get(<span class="string">'href'</span>)</span><br><span class="line">        <span class="comment"># 获取问题的标题</span></span><br><span class="line">        title = link.get_text().replace(<span class="string">'\n'</span>, <span class="string">''</span>)</span><br><span class="line">        <span class="comment"># 追加结果到列表</span></span><br><span class="line">        result_list.append([title, answer_link])</span><br><span class="line">    <span class="comment"># 创建sql语句</span></span><br><span class="line">    sql = <span class="string">'insert into result_news values (%s, %s)'</span></span><br><span class="line">    <span class="comment"># 写入数据库</span></span><br><span class="line">    get_mysql(sql, result_list)</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">'__main__'</span>:</span><br><span class="line">    url = <span class="string">'https://www.zhihu.com/explore'</span></span><br><span class="line">    start_crawl(url)</span><br></pre></td></tr></table></figure>
<h5 id="6-requests库的使用"><a href="#6-requests库的使用" class="headerlink" title="6. requests库的使用"></a>6. requests库的使用</h5><p><a href="http://docs.python-requests.org/zh_CN/latest/user/quickstart.html" target="_blank" rel="noopener">中文官网地址</a></p>
<h6 id="6-1安装"><a href="#6-1安装" class="headerlink" title="6.1安装"></a>6.1安装</h6><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">pip install requests</span><br></pre></td></tr></table></figure>
<h6 id="6-2-发送请求，GET、POST、PUT、PATCH、DELETE"><a href="#6-2-发送请求，GET、POST、PUT、PATCH、DELETE" class="headerlink" title="6.2 发送请求，GET、POST、PUT、PATCH、DELETE"></a>6.2 发送请求，GET、POST、PUT、PATCH、DELETE</h6><p>使用 Requests 发送网络请求非常简单。</p>
<p>一开始要导入 Requests 模块：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br></pre></td></tr></table></figure>
<p>然后，尝试获取某个网页。本例子中，我们来获取 Github 的公共时间线：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">r = requests.get(<span class="string">'https://api.github.com/events'</span>)</span><br></pre></td></tr></table></figure>
<p>现在，我们有一个名为 r 的 Response 对象。我们可以从这个对象中获取所有我们想要的信息。</p>
<p>Requests 简便的 API 意味着所有 HTTP 请求类型都是显而易见的。例如，你可以这样发送一个 HTTP POST 请求：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">r = requests.post(<span class="string">'http://httpbin.org/post'</span>, data = &#123;<span class="string">'key'</span>:<span class="string">'value'</span>&#125;)</span><br></pre></td></tr></table></figure>
<p>漂亮，对吧？那么其他 HTTP 请求类型：PUT，DELETE，HEAD 以及 OPTIONS 又是如何的呢？都是一样的简单：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line">r = requests.put(<span class="string">'http://httpbin.org/put'</span>, data = &#123;<span class="string">'key'</span>:<span class="string">'value'</span>&#125;)</span><br><span class="line"></span><br><span class="line">r = requests.delete(<span class="string">'http://httpbin.org/delete'</span>)</span><br><span class="line"></span><br><span class="line">r = requests.head(<span class="string">'http://httpbin.org/get'</span>)</span><br><span class="line"></span><br><span class="line">r = requests.options(<span class="string">'http://httpbin.org/get'</span>)</span><br></pre></td></tr></table></figure>
<p>都很不错吧，但这也仅是 Requests 的冰山一角呢。</p>
<h6 id="6-3-传递-URL-参数"><a href="#6-3-传递-URL-参数" class="headerlink" title="6.3 传递 URL 参数"></a>6.3 传递 URL 参数</h6><p>你也许经常想为 URL 的查询字符串(query string)传递某种数据。如果你是手工构建 URL，那么数据会以键/值对的形式置于 URL 中，跟在一个问号的后面。例如， httpbin.org/get?key=val。</p>
<p>Requests 允许你使用 params 关键字参数，以一个字符串字典来提供这些参数。</p>
<p>举例来说，如果你想传递 key1=value1 和 key2=value2 到 httpbin.org/get ，那么你可以使用如下代码：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">payload = &#123;<span class="string">'key1'</span>: <span class="string">'value1'</span>, <span class="string">'key2'</span>: <span class="string">'value2'</span>&#125;</span><br><span class="line"></span><br><span class="line">r = requests.get(<span class="string">"http://httpbin.org/get"</span>, params=payload)</span><br></pre></td></tr></table></figure>
<p>通过打印输出该 URL，你能看到 URL 已被正确编码：</p>
<figure class="highlight"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">print(r.url)</span><br><span class="line"></span><br><span class="line">http://httpbin.org/get?key2=value2&amp;key1=value1</span><br></pre></td></tr></table></figure>
<p>注意字典里值为 None 的键都不会被添加到 URL 的查询字符串里。</p>
<p>你还可以将一个列表作为值传入：</p>
<figure class="highlight"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line">payload = &#123;<span class="string">'key1'</span>: <span class="string">'value1'</span>, <span class="string">'key2'</span>: [<span class="string">'value2'</span>, <span class="string">'value3'</span>]&#125;</span><br><span class="line"></span><br><span class="line">r = requests.get(<span class="string">'http://httpbin.org/get'</span>, params=payload)</span><br><span class="line"></span><br><span class="line">print(r.url)</span><br><span class="line"></span><br><span class="line">http://httpbin.org/get?key1=value1&amp;key2=value2&amp;key2=value3</span><br></pre></td></tr></table></figure>
<h6 id="6-4-响应内容"><a href="#6-4-响应内容" class="headerlink" title="6.4 响应内容"></a>6.4 响应内容</h6><p>我们能读取服务器响应的内容。再次以 GitHub 时间线为例：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line">r = requests.get(<span class="string">'https://api.github.com/events'</span>)</span><br><span class="line">r.text</span><br><span class="line"></span><br><span class="line"><span class="string">u'[&#123;"repository":&#123;"open_issues":0,"url":"https://github.com/...</span></span><br></pre></td></tr></table></figure>
<p>Requests 会自动解码来自服务器的内容。大多数 unicode 字符集都能被无缝地解码。</p>
<p>请求发出后，Requests 会基于 HTTP 头部对响应的编码作出有根据的推测。当你访问 r.text 之时，Requests 会使用其推测的文本编码。你可以找出 Requests 使用了什么编码，并且能够使用 r.encoding 属性来改变它：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">r.encoding</span><br><span class="line"><span class="string">'utf-8'</span></span><br><span class="line"></span><br><span class="line">r.encoding = <span class="string">'ISO-8859-1'</span></span><br></pre></td></tr></table></figure>
<p>如果你改变了编码，每当你访问 r.text ，Requests 都将会使用 r.encoding 的新值。你可能希望在使用特殊逻辑计算出文本的编码的情况下来修改编码。比如 HTTP 和 XML 自身可以指定编码。这样的话，你应该使用 r.content 来找到编码，然后设置 r.encoding 为相应的编码。这样就能使用正确的编码解析 r.text 了。</p>
<p>在你需要的情况下，Requests 也可以使用定制的编码。如果你创建了自己的编码，并使用 codecs 模块进行注册，你就可以轻松地使用这个解码器名称作为 r.encoding 的值， 然后由 Requests 来为你处理编码。</p>
<h6 id="6-5二进制响应内容"><a href="#6-5二进制响应内容" class="headerlink" title="6.5二进制响应内容"></a>6.5二进制响应内容</h6><p>你也能以字节的方式访问请求响应体，对于非文本请求：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">r.content</span><br><span class="line"></span><br><span class="line"><span class="string">b'[&#123;"repository":&#123;"open_issues":0,"url":"https://github.com/...</span></span><br></pre></td></tr></table></figure>
<p>Requests 会自动为你解码 gzip 和 deflate 传输编码的响应数据。</p>
<p>例如，以请求返回的二进制数据创建一张图片，你可以使用如下代码：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> PIL <span class="keyword">import</span> Image</span><br><span class="line"><span class="keyword">from</span> io <span class="keyword">import</span> BytesIO</span><br><span class="line"></span><br><span class="line">i = Image.open(BytesIO(r.content))</span><br></pre></td></tr></table></figure>
<h6 id="6-6-JSON-响应内容"><a href="#6-6-JSON-响应内容" class="headerlink" title="6.6 JSON 响应内容"></a>6.6 JSON 响应内容</h6><p>Requests 中也有一个内置的 JSON 解码器，助你处理 JSON 数据：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"></span><br><span class="line">r = requests.get(<span class="string">'https://api.github.com/events'</span>)</span><br><span class="line">r.json()</span><br><span class="line">[&#123;<span class="string">u'repository'</span>: &#123;<span class="string">u'open_issues'</span>: <span class="number">0</span>, <span class="string">u'url'</span>: <span class="string">'https://github.com/...</span></span><br></pre></td></tr></table></figure>
<p>如果 JSON 解码失败， r.json() 就会抛出一个异常。例如，响应内容是 401 (Unauthorized)，尝试访问 r.json() 将会抛出 ValueError: No JSON object could be decoded 异常。</p>
<p>需要注意的是，成功调用 r.json() 并<strong>不</strong>意味着响应的成功。有的服务器会在失败的响应中包含一个 JSON 对象（比如 HTTP 500 的错误细节）。这种 JSON 会被解码返回。要检查请求是否成功，请使用 r.raise_for_status() 或者检查 r.status_code 是否和你的期望相同。</p>
<h6 id="6-7-原始响应内容"><a href="#6-7-原始响应内容" class="headerlink" title="6.7 原始响应内容"></a>6.7 原始响应内容</h6><p>在罕见的情况下，你可能想获取来自服务器的原始套接字响应，那么你可以访问 r.raw。 如果你确实想这么干，那请你确保在初始请求中设置了 stream=True。具体你可以这么做：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">r = requests.get(<span class="string">'https://api.github.com/events'</span>, stream=<span class="keyword">True</span>)</span><br><span class="line">r.raw</span><br><span class="line">&lt;requests.packages.urllib3.response.HTTPResponse object at <span class="number">0x101194810</span>&gt;</span><br><span class="line">r.raw.read(<span class="number">10</span>)</span><br><span class="line"><span class="string">'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03'</span></span><br></pre></td></tr></table></figure>
<p>但一般情况下，你应该以下面的模式将文本流保存到文件：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">with</span> open(filename, <span class="string">'wb'</span>) <span class="keyword">as</span> fd:</span><br><span class="line">    <span class="keyword">for</span> chunk <span class="keyword">in</span> r.iter_content(chunk_size):</span><br><span class="line">        fd.write(chunk)</span><br></pre></td></tr></table></figure>
<p>使用 Response.iter_content 将会处理大量你直接使用 Response.raw 时不得不处理的细节。当流式下载时，上面是优先推荐的获取内容方式。注意，chunk_size 可以自由调整为更适合你使用场景的数值。</p>
<h6 id="6-8-定制请求头"><a href="#6-8-定制请求头" class="headerlink" title="6.8 定制请求头"></a>6.8 定制请求头</h6><p>如果你想为请求添加 HTTP 头部，只要简单地传递一个 dict 给 headers 参数就可以了。</p>
<p>例如，在前一个示例中我们没有指定 user-agent:</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">url = <span class="string">'https://api.github.com/some/endpoint'</span></span><br><span class="line">headers = &#123;<span class="string">'user-agent'</span>: <span class="string">'my-app/0.0.1'</span>&#125;</span><br><span class="line"></span><br><span class="line">r = requests.get(url, headers=headers)</span><br></pre></td></tr></table></figure>
<p>注意: 定制 header 的优先级低于某些特定的信息源，例如：</p>
<p>如果在 .netrc 中设置了用户认证信息，使用 headers= 设置的授权就不会生效。而如果设置了 auth= 参数，<code>.netrc</code> 的设置就无效了。</p>
<p>如果被重定向到别的主机，授权 header 就会被删除。</p>
<p>代理授权 header 会被 URL 中提供的代理身份覆盖掉。</p>
<p>在我们能判断内容长度的情况下，header 的 Content-Length 会被改写。</p>
<p>更进一步讲，Requests 不会基于定制 header 的具体情况改变自己的行为。只不过在最后的请求中，所有的 header 信息都会被传递进去。</p>
<p>注意: 所有的 header 值必须是 string、bytestring 或者 unicode。尽管传递 unicode header 也是允许的，但不建议这样做。</p>
<p>更加复杂的 POST 请求</p>
<p>通常，你想要发送一些编码为表单形式的数据——非常像一个 HTML 表单。要实现这个，只需简单地传递一个字典给 data 参数。你的数据字典在发出请求时会自动编码为表单形式：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre></td><td class="code"><pre><span class="line">payload = &#123;<span class="string">'key1'</span>: <span class="string">'value1'</span>, <span class="string">'key2'</span>: <span class="string">'value2'</span>&#125;</span><br><span class="line"></span><br><span class="line">r = requests.post(<span class="string">"http://httpbin.org/post"</span>, data=payload)</span><br><span class="line"></span><br><span class="line">print(r.text)</span><br><span class="line"></span><br><span class="line">&#123;</span><br><span class="line">  ...</span><br><span class="line">  <span class="string">"form"</span>: &#123;</span><br><span class="line">    <span class="string">"key2"</span>: <span class="string">"value2"</span>,</span><br><span class="line">    <span class="string">"key1"</span>: <span class="string">"value1"</span></span><br><span class="line">  &#125;,</span><br><span class="line">  ...</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>你还可以为 data 参数传入一个元组列表。在表单中多个元素使用同一 key 的时候，这种方式尤其有效：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre></td><td class="code"><pre><span class="line">payload = ((<span class="string">'key1'</span>, <span class="string">'value1'</span>), (<span class="string">'key1'</span>, <span class="string">'value2'</span>))</span><br><span class="line">r = requests.post(<span class="string">'http://httpbin.org/post'</span>, data=payload)</span><br><span class="line">print(r.text)</span><br><span class="line"></span><br><span class="line">&#123;</span><br><span class="line">  ...</span><br><span class="line">  <span class="string">"form"</span>: &#123;</span><br><span class="line">    <span class="string">"key1"</span>: [</span><br><span class="line">      <span class="string">"value1"</span>,</span><br><span class="line">      <span class="string">"value2"</span></span><br><span class="line">    ]</span><br><span class="line">  &#125;,</span><br><span class="line">  ...</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>很多时候你想要发送的数据并非编码为表单形式的。如果你传递一个 string 而不是一个 dict，那么数据会被直接发布出去。</p>
<p>例如，Github API v3 接受编码为 JSON 的 POST/PATCH 数据：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> json</span><br><span class="line"></span><br><span class="line">url = <span class="string">'https://api.github.com/some/endpoint'</span></span><br><span class="line">payload = &#123;<span class="string">'some'</span>: <span class="string">'data'</span>&#125;</span><br><span class="line"></span><br><span class="line">r = requests.post(url, data=json.dumps(payload))</span><br></pre></td></tr></table></figure>
<p>此处除了可以自行对 dict 进行编码，你还可以使用 json 参数直接传递，然后它就会被自动编码。这是 2.4.2 版的新加功能：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">url = <span class="string">'https://api.github.com/some/endpoint'</span></span><br><span class="line">payload = &#123;<span class="string">'some'</span>: <span class="string">'data'</span>&#125;</span><br><span class="line"></span><br><span class="line">r = requests.post(url, json=payload)</span><br></pre></td></tr></table></figure>
<p>POST一个多部分编码(Multipart-Encoded)的文件</p>
<p>Requests 使得上传多部分编码文件变得很简单：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre></td><td class="code"><pre><span class="line">url = <span class="string">'http://httpbin.org/post'</span></span><br><span class="line">files = &#123;<span class="string">'file'</span>: open(<span class="string">'report.xls'</span>, <span class="string">'rb'</span>)&#125;</span><br><span class="line"></span><br><span class="line">r = requests.post(url, files=files)</span><br><span class="line">r.text</span><br><span class="line"></span><br><span class="line">&#123;</span><br><span class="line">  ...</span><br><span class="line">  <span class="string">"files"</span>: &#123;</span><br><span class="line">    <span class="string">"file"</span>: <span class="string">"&lt;censored...binary...data&gt;"</span></span><br><span class="line">  &#125;,</span><br><span class="line">  ...</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>你可以显式地设置文件名，文件类型和请求头：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre></td><td class="code"><pre><span class="line">url = <span class="string">'http://httpbin.org/post'</span></span><br><span class="line">files = &#123;<span class="string">'file'</span>: (<span class="string">'report.xls'</span>, open(<span class="string">'report.xls'</span>, <span class="string">'rb'</span>), <span class="string">'application/vnd.ms-excel'</span>, &#123;<span class="string">'Expires'</span>: <span class="string">'0'</span>&#125;)&#125;</span><br><span class="line"></span><br><span class="line">r = requests.post(url, files=files)</span><br><span class="line">r.text</span><br><span class="line">&#123;</span><br><span class="line">  ...</span><br><span class="line">  <span class="string">"files"</span>: &#123;</span><br><span class="line">    <span class="string">"file"</span>: <span class="string">"&lt;censored...binary...data&gt;"</span></span><br><span class="line">  &#125;,</span><br><span class="line">  ...</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>如果你想，你也可以发送作为文件来接收的字符串：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre></td><td class="code"><pre><span class="line">url = <span class="string">'http://httpbin.org/post'</span></span><br><span class="line">files = &#123;<span class="string">'file'</span>: (<span class="string">'report.csv'</span>, <span class="string">'some,data,to,send\nanother,row,to,send\n'</span>)&#125;</span><br><span class="line"></span><br><span class="line">r = requests.post(url, files=files)</span><br><span class="line">r.text</span><br><span class="line">&#123;</span><br><span class="line">  ...</span><br><span class="line">  <span class="string">"files"</span>: &#123;</span><br><span class="line">    <span class="string">"file"</span>: <span class="string">"some,data,to,send\\nanother,row,to,send\\n"</span></span><br><span class="line">  &#125;,</span><br><span class="line">  ...</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>如果你发送一个非常大的文件作为 multipart/form-data 请求，你可能希望将请求做成数据流。默认下 requests 不支持, 但有个第三方包 requests-toolbelt 是支持的。你可以阅读 toolbelt 文档 来了解使用方法。</p>
<blockquote>
<p><strong>警告:</strong></p>
<p>我们强烈建议你用二进制模式(binary mode)打开文件。这是因为 Requests 可能会试图为你提供 Content-Length header，在它这样做的时候，这个值会被设为文件的字节数（bytes）。如果用文本模式(text mode)打开文件，就可能会发生错误。</p>
</blockquote>
<h6 id="6-9-响应状态码"><a href="#6-9-响应状态码" class="headerlink" title="6.9 响应状态码"></a>6.9 响应状态码</h6><p>我们可以检测响应状态码：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">r = requests.get(<span class="string">'http://httpbin.org/get'</span>)</span><br><span class="line">r.status_code</span><br><span class="line"><span class="number">200</span></span><br></pre></td></tr></table></figure>
<p>为方便引用，Requests还附带了一个内置的状态码查询对象：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">r.status_code == requests.codes.ok</span><br><span class="line"></span><br><span class="line"><span class="keyword">True</span></span><br></pre></td></tr></table></figure>
<p>如果发送了一个错误请求(一个 4XX 客户端错误，或者 5XX 服务器错误响应)，我们可以通过 Response.raise_for_status() 来抛出异常：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line">bad_r = requests.get(<span class="string">'http://httpbin.org/status/404'</span>)</span><br><span class="line">bad_r.status_code</span><br><span class="line"><span class="number">404</span></span><br><span class="line"></span><br><span class="line">bad_r.raise_for_status()</span><br><span class="line">Traceback (most recent call last):</span><br><span class="line">	File <span class="string">"requests/models.py"</span>, line <span class="number">832</span>, <span class="keyword">in</span> raise_for_status</span><br><span class="line">    		<span class="keyword">raise</span> http_error</span><br></pre></td></tr></table></figure>
<p><code>requests.exceptions.HTTPError: 404 Client Error</code></p>
<p>但是，由于我们的例子中 r 的 status_code 是 200 ，当我们调用 raise_for_status() 时，得到的是：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">r.raise_for_status()</span><br><span class="line"><span class="keyword">None</span></span><br></pre></td></tr></table></figure>
<p>一切都挺和谐哈。</p>
<h6 id="6-10-响应头"><a href="#6-10-响应头" class="headerlink" title="6.10 响应头"></a>6.10 响应头</h6><p>我们可以查看以一个 Python 字典形式展示的服务器响应头：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line">r.headers</span><br><span class="line"></span><br><span class="line">&#123;</span><br><span class="line">    <span class="string">'content-encoding'</span>: <span class="string">'gzip'</span>,</span><br><span class="line">    <span class="string">'transfer-encoding'</span>: <span class="string">'chunked'</span>,</span><br><span class="line">    <span class="string">'connection'</span>: <span class="string">'close'</span>,</span><br><span class="line">    <span class="string">'server'</span>: <span class="string">'nginx/1.0.4'</span>,</span><br><span class="line">    <span class="string">'x-runtime'</span>: <span class="string">'148ms'</span>,</span><br><span class="line">    <span class="string">'etag'</span>: <span class="string">'"e1ca502697e5c9317743dc078f67693f"'</span>,</span><br><span class="line">    <span class="string">'content-type'</span>: <span class="string">'application/json'</span></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>但是这个字典比较特殊：它是仅为 HTTP 头部而生的。根据 RFC 2616， <strong>HTTP 头部是大小写不敏感的。</strong></p>
<p>因此，我们可以使用任意大小写形式来访问这些响应头字段：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">r.headers[<span class="string">'Content-Type'</span>]</span><br><span class="line"><span class="string">'application/json'</span></span><br><span class="line"></span><br><span class="line">r.headers.get(<span class="string">'content-type'</span>)</span><br><span class="line"><span class="string">'application/json'</span></span><br></pre></td></tr></table></figure>
<p>它还有一个特殊点，那就是服务器可以多次发送同一 header，每次都使用不同的值。但 Requests 会将它们合并，这样它们就可以用一个映射来表示出来，参见 RFC 7230:</p>
<p>A recipient MAY combine multiple header fields with the same field name into one “field-name: field-value” pair, without changing the semantics of the message, by appending each subsequent field value to the combined field value in order, separated by a comma.</p>
<p>接收者可以合并多个相同名称的 header 栏位，把它们合为一个 “field-name: field-value” 配对，将每个后续的栏位值依次追加到合并的栏位值中，用逗号隔开即可，这样做不会改变信息的语义。</p>
<h6 id="6-11-Cookie"><a href="#6-11-Cookie" class="headerlink" title="6.11 Cookie"></a>6.11 Cookie</h6><p>如果某个响应中包含一些 cookie，你可以快速访问它们：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">url = <span class="string">'http://example.com/some/cookie/setting/url'</span></span><br><span class="line">r = requests.get(url)</span><br><span class="line"></span><br><span class="line">r.cookies[<span class="string">'example_cookie_name'</span>]</span><br><span class="line"><span class="string">'example_cookie_value'</span></span><br></pre></td></tr></table></figure>
<p>要想发送你的cookies到服务器，可以使用 cookies 参数：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line">url = <span class="string">'http://httpbin.org/cookies'</span></span><br><span class="line">cookies = dict(cookies_are=<span class="string">'working'</span>)</span><br><span class="line"></span><br><span class="line">r = requests.get(url, cookies=cookies)</span><br><span class="line">r.text</span><br><span class="line"><span class="string">'&#123;"cookies": &#123;"cookies_are": "working"&#125;&#125;'</span></span><br></pre></td></tr></table></figure>
<p>Cookie 的返回对象为 RequestsCookieJar，它的行为和字典类似，但接口更为完整，适合跨域名跨路径使用。你还可以把 Cookie Jar 传到 Requests 中：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line">jar = requests.cookies.RequestsCookieJar()</span><br><span class="line">jar.set(<span class="string">'tasty_cookie'</span>, <span class="string">'yum'</span>, domain=<span class="string">'httpbin.org'</span>, path=<span class="string">'/cookies'</span>)</span><br><span class="line">jar.set(<span class="string">'gross_cookie'</span>, <span class="string">'blech'</span>, domain=<span class="string">'httpbin.org'</span>, path=<span class="string">'/elsewhere'</span>)</span><br><span class="line">url = <span class="string">'http://httpbin.org/cookies'</span></span><br><span class="line">r = requests.get(url, cookies=jar)</span><br><span class="line">r.text</span><br><span class="line"><span class="string">'&#123;"cookies": &#123;"tasty_cookie": "yum"&#125;&#125;'</span></span><br></pre></td></tr></table></figure>
<h6 id="6-12-重定向与请求历史"><a href="#6-12-重定向与请求历史" class="headerlink" title="6.12 重定向与请求历史"></a>6.12 重定向与请求历史</h6><p>默认情况下，除了 HEAD，Requests 会自动处理所有重定向。</p>
<p>可以使用响应对象的 history 属性来追踪重定向。</p>
<p>Response.history 是一个 Response 对象的列表，为了完成请求而创建了这些对象。这个对象列表按照从最老到最近的请求进行排序。</p>
<p>例如，Github 将所有的 HTTP 请求重定向到 HTTPS：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line">r = requests.get(<span class="string">'http://github.com'</span>)</span><br><span class="line"></span><br><span class="line">r.url</span><br><span class="line"><span class="string">'https://github.com/'</span></span><br><span class="line"></span><br><span class="line">r.status_code</span><br><span class="line"><span class="number">200</span></span><br><span class="line"></span><br><span class="line">r.history</span><br><span class="line">[&lt;Response [<span class="number">301</span>]&gt;]</span><br></pre></td></tr></table></figure>
<p>如果你使用的是GET、OPTIONS、POST、PUT、PATCH 或者 DELETE，那么你可以通过 allow_redirects 参数禁用重定向处理：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">r = requests.get(<span class="string">'http://github.com'</span>, allow_redirects=<span class="keyword">False</span>)</span><br><span class="line">r.status_code</span><br><span class="line"><span class="number">301</span></span><br><span class="line">r.history</span><br><span class="line">[]</span><br></pre></td></tr></table></figure>
<p>如果你使用了 HEAD，你也可以启用重定向：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">r = requests.head(<span class="string">'http://github.com'</span>, allow_redirects=<span class="keyword">True</span>)</span><br><span class="line">r.url</span><br><span class="line"><span class="string">'https://github.com/'</span></span><br><span class="line">r.history</span><br><span class="line">[&lt;Response [<span class="number">301</span>]&gt;]</span><br></pre></td></tr></table></figure>
<h6 id="6-13-超时"><a href="#6-13-超时" class="headerlink" title="6.13 超时"></a>6.13 超时</h6><p>你可以告诉 requests 在经过以 timeout 参数设定的秒数时间之后停止等待响应。基本上所有的生产代码都应该使用这一参数。如果不使用，你的程序可能会永远失去响应：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">requests.get(<span class="string">'http://github.com'</span>, timeout=<span class="number">0.001</span>)</span><br><span class="line">Traceback (most recent call last):</span><br><span class="line">  File <span class="string">"&lt;stdin&gt;"</span>, line <span class="number">1</span>, <span class="keyword">in</span> &lt;module&gt;</span><br><span class="line">requests.exceptions.Timeout: HTTPConnectionPool(host=<span class="string">'github.com'</span>, port=<span class="number">80</span>): Request timed out. (timeout=<span class="number">0.001</span>)</span><br></pre></td></tr></table></figure>
<blockquote>
<p><strong>注意:</strong></p>
<p>timeout 仅对连接过程有效，与响应体的下载无关。 timeout 并不是整个下载响应的时间限制，而是如果服务器在 timeout 秒内没有应答，将会引发一个异常（更精确地说，是在 timeout 秒内没有从基础套接字上接收到任何字节的数据时）。如果没有显式指定 timeout，requests 不会超时（If no timeout is specified explicitly, requests do not time out）。</p>
</blockquote>
<h6 id="错误与异常"><a href="#错误与异常" class="headerlink" title="错误与异常"></a>错误与异常</h6><p>遇到网络问题（如：DNS 查询失败、拒绝连接等）时，Requests 会抛出一个 ConnectionError 异常。</p>
<p>如果 HTTP 请求返回了不成功的状态码， Response.raise_for_status() 会抛出一个 HTTPError 异常。</p>
<p>若请求超时，则抛出一个 Timeout 异常。</p>
<p>若请求超过了设定的最大重定向次数，则会抛出一个 TooManyRedirects 异常。</p>
<p>所有Requests显式抛出的异常都继承自 requests.exceptions.RequestException 。</p>
<p><strong>案例1:爬取豆瓣电影的电影名、评分等信息</strong></p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span 
class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> urllib.request</span><br><span class="line"><span class="keyword">from</span> urllib <span class="keyword">import</span> parse</span><br><span class="line"><span class="keyword">import</span> json</span><br><span class="line"></span><br><span class="line"><span class="string">"""</span></span><br><span class="line"><span class="string">获取豆瓣电影中的电影资源</span></span><br><span class="line"><span class="string">豆瓣电影url地址：https://movie.douban.com/explore#!type=movie&amp;tag=%E7%83%AD%E9%97%A8&amp;sort=recommend&amp;page_limit=20&amp;page_start=0</span></span><br><span class="line"><span class="string">分析：</span></span><br><span class="line"><span class="string">    1. 该页面中的的电影资源信息都是通过ajax异步加载进行刷新出来的</span></span><br><span class="line"><span class="string">    2. 在F12下的network中过滤XHR(XMLHTTPRESPONSE)请求，可以查看到真正的异步的请求地址如下</span></span><br><span class="line"><span class="string">        https://movie.douban.com/j/search_subjects?type=movie&amp;tag=%E7%83%AD%E9%97%A8&amp;sort=recommend&amp;page_limit=20&amp;page_start=20</span></span><br><span class="line"><span class="string">    3. 正在的请求地址中，type为类型，tag为标签（热门、经典、最新、爱情、科幻等等），sort为排序，page_limit为每一个的条数，page_start为开始的条数下标</span></span><br><span class="line"><span class="string">    4. 
获取tag类型的url地址为： https://movie.douban.com/j/search_tags?type=movie&amp;source=</span></span><br><span class="line"><span class="string">"""</span></span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">urllib_open</span><span class="params">(url)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    公共的处理代码</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    header = &#123;</span><br><span class="line">        <span class="string">'User-Agent'</span>: <span class="string">'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'</span></span><br><span class="line">    &#125;</span><br><span class="line">    req = urllib.request.Request(url=url, headers=header)</span><br><span class="line">    res = urllib.request.urlopen(req)</span><br><span class="line"></span><br><span class="line">    <span class="keyword">return</span> res.read().decode(<span class="string">'utf-8'</span>)</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_movie_tag</span><span class="params">(url)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取电影的分类tag</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    tag_res = urllib_open(url)</span><br><span class="line">    <span class="comment"># 返回的tag_res的结果为'&#123;"tags":["热门","最新","经典","可播放","豆瓣高分","冷门佳片","华语","欧美","韩国","日本","动作","喜剧","爱情","科幻","悬疑","恐怖","成长"]&#125;'</span></span><br><span class="line">    <span class="comment"># 其结果为一个字符串类型的数据，需要将之转化为字典类型的</span></span><br><span class="line">    result = 
json.loads(tag_res)</span><br><span class="line">    content = result[<span class="string">'tags'</span>]</span><br><span class="line">    <span class="keyword">return</span> content</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_movies</span><span class="params">(tag_url, movies_url)</span>:</span></span><br><span class="line">    tag_content = get_movie_tag(tag_url)</span><br><span class="line">    <span class="comment"># 循环tag的内容，拼接出指定tag的电影内容</span></span><br><span class="line">    <span class="comment"># movies_url中指定电影类型的参数是tag=热门或者最新等等，所以需要进行tag的内容的编码</span></span><br><span class="line">    tag_list = []</span><br><span class="line">    print(tag_content)</span><br><span class="line">    <span class="keyword">for</span> tag <span class="keyword">in</span> tag_content:</span><br><span class="line">        data = &#123;<span class="string">'tag'</span>: tag&#125;</span><br><span class="line">        search_tag = parse.urlencode(data)</span><br><span class="line">        tag_list.append(search_tag)</span><br><span class="line"></span><br><span class="line">    <span class="keyword">for</span> search_tag <span class="keyword">in</span> tag_list:</span><br><span class="line">        seatch_url = movies_url</span><br><span class="line">        seatch_url = seatch_url % (search_tag)</span><br><span class="line">        movies_res = urllib_open(seatch_url)</span><br><span class="line">        res = json.loads(movies_res)</span><br><span class="line">        result = res[<span class="string">'subjects'</span>]</span><br><span class="line">        <span class="keyword">for</span> res <span class="keyword">in</span> result:</span><br><span class="line">            print(<span class="string">'标题:%s，评分：%s'</span> % (res[<span class="string">'title'</span>], res[<span class="string">'rate'</span>]))</span><br><span class="line"></span><br><span 
class="line"><span class="keyword">if</span> __name__ == <span class="string">'__main__'</span>:</span><br><span class="line">    tag_url = <span class="string">'https://movie.douban.com/j/search_tags?type=movie&amp;source='</span></span><br><span class="line">    movies_url = <span class="string">'https://movie.douban.com/j/search_subjects?type=movie&amp;%s&amp;sort=recommend&amp;page_limit=20&amp;page_start=0'</span></span><br><span class="line">    get_movies(tag_url, movies_url)</span><br></pre></td></tr></table></figure>
<p><strong>案例2:爬取图片</strong> </p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"><span class="keyword">import</span> os</span><br><span class="line"><span class="keyword">from</span> bs4 <span class="keyword">import</span> BeautifulSoup</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">start_spider_image</span><span class="params">(url)</span>:</span></span><br><span class="line">    headers = &#123;</span><br><span class="line">        <span class="string">'User-Agent'</span>: <span class="string">'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'</span></span><br><span class="line">    &#125;</span><br><span class="line">    req = requests.get(url, headers=headers)</span><br><span class="line">    soup = BeautifulSoup(req.text, <span class="string">'lxml'</span>)</span><br><span class="line">    imgs = soup.find_all(<span class="string">'img'</span>, <span class="string">'photo-item__img'</span>)</span><br><span class="line">    img_link_list = []</span><br><span class="line">    <span class="keyword">for</span> img <span class="keyword">in</span> imgs:</span><br><span class="line">        img_link = img.get(<span class="string">'data-big-src'</span>)</span><br><span class="line">        img_link_list.append(img_link)</span><br><span class="line"></span><br><span class="line">    <span class="comment"># 文件保存的路径</span></span><br><span class="line">    path = <span class="string">'/Users/lizhonglin/Desktop/Code/spider00/day02/image'</span></span><br><span class="line"></span><br><span class="line">    <span class="comment"># 获取文件名</span></span><br><span class="line">    <span class="keyword">for</span> i <span class="keyword">in</span> img_link_list:</span><br><span class="line">        <span class="comment"># 获取保存文件的文件名</span></span><br><span class="line">        filename = i.split(<span class="string">'?'</span>)[<span class="number">0</span>].split(<span class="string">'/'</span>)[<span class="number">-1</span>]</span><br><span class="line">        <span class="comment"># 文件写入操作</span></span><br><span class="line">        <span class="keyword">with</span> open(filename, <span class="string">'wb'</span>) <span class="keyword">as</span> f:</span><br><span class="line">            <span class="comment"># 切换到image目录</span></span><br><span class="line">            os.chdir(path)</span><br><span class="line">            <span class="comment"># 写入图片数据</span></span><br><span class="line">            
f.write(requests.get(i).content)</span><br><span class="line">            f.close()</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">'__main__'</span>:</span><br><span class="line">    image = input(<span class="string">'请输入搜索关键词英文:'</span>)</span><br><span class="line">    <span class="comment"># 请求url</span></span><br><span class="line">    url = <span class="string">'https://www.pexels.com/search/'</span> + image + <span class="string">'/'</span></span><br><span class="line">    start_spider_image(url)</span><br></pre></td></tr></table></figure>
<p>​    <strong>在使用爬虫之前，我们需要了解到很多的概念知识，包括同步、异步概念， 阻塞、非阻塞概念，并发、并行概念，多线程、多进程概念，线程锁概念，协程概念等等</strong></p>
<h5 id="7-同步异步"><a href="#7-同步异步" class="headerlink" title="7 . 同步异步"></a>7 . 同步异步</h5><ol>
<li><h6 id="同步"><a href="#同步" class="headerlink" title="同步"></a>同步</h6><ul>
<li>不同程序单元为了完成某个任务，在执行过程中需靠某种通信方式以<strong>协调一致</strong>，称这些程序单元是同步执行的。</li>
<li>例如购物系统中更新商品库存，需要用“行锁”作为通信信号，让不同的更新请求强制排队顺序执行，那更新库存的操作是同步的。</li>
<li>简言之，<strong>同步意味着有序</strong>。</li>
</ul>
</li>
<li><h6 id="异步"><a href="#异步" class="headerlink" title="异步"></a>异步</h6><ul>
<li>为完成某个任务，不同程序单元之间<strong>过程中无需通信协调</strong>，也能完成任务的方式。</li>
<li>不相关的程序单元之间可以是异步的。</li>
<li>例如，爬虫下载网页。调度程序调用下载程序后，即可调度其他任务，而无需与该下载任务保持通信以协调行为。不同网页的下载、保存等操作都是无关的，也无需相互通知协调。这些异步操作的完成时刻并不确定。</li>
<li>简言之，<strong>异步意味着无序</strong>。</li>
</ul>
<p>上文提到的“通信方式”通常是指异步和并发编程提供的同步原语，如信号量、锁、同步队列等等。我们需知道，虽然这些通信方式是为了让多个程序在一定条件下同步执行，但正是因为异步的存在，才需要这些通信方式。如果所有程序都是按序执行，其本身就是同步的，又何需这些同步信号呢？</p>
<p>​</p>
</li>
</ol>
<h5 id="8-阻塞非阻塞"><a href="#8-阻塞非阻塞" class="headerlink" title="8. 阻塞非阻塞"></a>8. 阻塞非阻塞</h5><ol>
<li><p>阻塞</p>
<ul>
<li>程序未得到所需计算资源时被挂起的状态。</li>
<li><strong>程序在等待某个操作完成期间，自身无法继续干别的事情，则称该程序在该操作上是阻塞的。</strong></li>
<li>常见的阻塞形式有：网络I/O阻塞、磁盘I/O阻塞、用户输入阻塞等。</li>
</ul>
<p>阻塞是无处不在的，包括CPU切换上下文时，所有的进程都无法真正干事情，它们也会被阻塞。（如果是多核CPU则正在执行上下文切换操作的核不可被利用。）</p>
<p>​</p>
</li>
<li><p>非阻塞</p>
<ul>
<li><strong>程序在等待某操作过程中，自身不被阻塞，可以继续运行干别的事情，则称该程序在该操作上是非阻塞的。</strong></li>
<li>非阻塞并<strong>不是</strong>在任何程序级别、任何情况下都可以存在的。</li>
<li>仅当程序封装的级别可以囊括独立的子程序单元时，它才可能存在非阻塞状态。</li>
</ul>
<p>非阻塞的存在是因为阻塞存在，正因为某个操作阻塞导致的耗时与效率低下，我们才要把它变成非阻塞的。</p>
<p>​</p>
</li>
</ol>
<h5 id="9-同步和阻塞的区别"><a href="#9-同步和阻塞的区别" class="headerlink" title="9. 同步和阻塞的区别"></a>9. 同步和阻塞的区别</h5><p>同步是一个过程，阻塞是线程的一个状态。</p>
<p>当多个线程操作同一公共变量的时候可能会出现竞争的情况，这时候需要使用同步来防止多个线程同时占用资源：让一个线程处于运行状态，其余线程处于就绪状态；当前一个线程暂停时，后面处于就绪状态的线程在获取到资源和时间片以后，就会进入运行状态。所以阻塞只是线程的一个状态而已。</p>
<h5 id="10-并发-并行"><a href="#10-并发-并行" class="headerlink" title="10. 并发 并行"></a>10. 并发 并行</h5><ol>
<li><p>并发</p>
<ul>
<li>并发描述的是程序的组织结构。指程序要被设计成多个可独立执行的子任务。</li>
<li><strong>以利用有限的计算机资源使多个任务可以被实时或近实时执行为目的。</strong></li>
</ul>
</li>
<li><p>并行</p>
<ul>
<li>并行描述的是程序的执行状态。指多个任务同时被执行。</li>
<li><strong>以利用富余计算资源（多核CPU）加速完成多个任务为目的。</strong></li>
</ul>
<p>并发提供了一种程序组织结构方式，让问题的解决方案可以并行执行，但并行执行不是必须的。</p>
<p>​</p>
</li>
</ol>
<blockquote>
<p>总结:</p>
<ul>
<li><strong>并行</strong>是为了利用多核加速多任务完成的进度</li>
<li><strong>并发</strong>是为了让独立的子任务都有机会被尽快执行，但不一定能加速整体进度</li>
<li><strong>非阻塞</strong>是为了提高程序整体执行效率</li>
<li><strong>异步</strong>是高效地组织非阻塞任务的方式</li>
</ul>
<p>要支持并发，必须拆分为多任务，不同任务相对而言才有阻塞/非阻塞、同步/异步。所以，并发、异步、非阻塞三个词总是如影随形。</p>
</blockquote>
<p>了解了上面这些概念之后，我们可以改造一下案例一，使用多线程来爬取豆瓣电影的名称和评分：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span 
class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br><span class="line">80</span><br><span class="line">81</span><br><span class="line">82</span><br><span class="line">83</span><br><span class="line">84</span><br><span class="line">85</span><br><span class="line">86</span><br><span class="line">87</span><br><span class="line">88</span><br><span class="line">89</span><br><span class="line">90</span><br><span class="line">91</span><br><span class="line">92</span><br><span class="line">93</span><br><span class="line">94</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> time</span><br><span class="line"><span class="keyword">import</span> threading</span><br><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"><span class="keyword">from</span> urllib <span class="keyword">import</span> parse</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_tags</span><span class="params">(url)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取电影分类</span></span><br><span class="line"><span class="string">    :param url:</span></span><br><span class="line"><span class="string">    :return: 电影分类信息</span></span><br><span class="line"><span class="string">    
"""</span></span><br><span class="line">    headers = &#123;</span><br><span class="line">        <span class="string">'User-Agent'</span>: <span class="string">'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'</span></span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    req = requests.get(url, headers=headers)</span><br><span class="line">    <span class="keyword">return</span> req.json()</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_movie</span><span class="params">(url)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取电影信息</span></span><br><span class="line"><span class="string">    :param url:</span></span><br><span class="line"><span class="string">    :return:</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    headers = &#123;</span><br><span class="line">        <span class="string">'User-Agent'</span>: <span class="string">'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'</span></span><br><span class="line">    &#125;</span><br><span class="line">    req = requests.get(url, headers=headers)</span><br><span class="line">    result = req.json()</span><br><span class="line">    <span class="keyword">for</span> i <span class="keyword">in</span> result[<span class="string">'subjects'</span>]:</span><br><span class="line">        title = i[<span class="string">'title'</span>]</span><br><span class="line">        rate = i[<span class="string">'rate'</span>]</span><br><span class="line">        <span class="comment">#  这步可以在优化一下 存储到数据库而不使用print.</span></span><br><span class="line">        print(<span 
class="string">'名称:%s ,评分:%s'</span> % (title, rate))</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">GetMovie</span><span class="params">(threading.Thread)</span>:</span></span><br><span class="line">    <span class="string">"""定义多线程类"""</span></span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self)</span>:</span></span><br><span class="line">        super(GetMovie, self).__init__()</span><br><span class="line">        <span class="comment"># 设置线程锁</span></span><br><span class="line">        self.movie_lock = threading.Lock()</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">update_movie_lock</span><span class="params">(self)</span>:</span></span><br><span class="line">        <span class="comment"># 判断是否被锁住</span></span><br><span class="line">        <span class="keyword">if</span> self.movie_lock.acquire():</span><br><span class="line">            <span class="comment"># 如果被锁住就弹出一个连接,如果所有结果都被弹出了 就返回空</span></span><br><span class="line">            link = movie_url_list.pop() <span class="keyword">if</span> movie_url_list <span class="keyword">else</span> <span class="string">''</span></span><br><span class="line">            <span class="comment"># 释放线程锁</span></span><br><span class="line">            self.movie_lock.release()</span><br><span class="line">            <span class="keyword">return</span> link</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">run</span><span class="params">(self)</span>:</span></span><br><span class="line">        <span class="comment"># 获取url的地址</span></span><br><span class="line">        link = 
self.update_movie_lock()</span><br><span class="line">        <span class="comment"># 判断是否有url</span></span><br><span class="line">        <span class="keyword">if</span> link:</span><br><span class="line">            <span class="comment"># 获取资源</span></span><br><span class="line">            get_movie(link)</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">'__main__'</span>:</span><br><span class="line">    <span class="comment"># 分类的url</span></span><br><span class="line">    url = <span class="string">'https://movie.douban.com/j/search_tags?type=movie&amp;source='</span></span><br><span class="line">    <span class="comment"># 电影信息的url</span></span><br><span class="line">    movie_url = <span class="string">'https://movie.douban.com/j/search_subjects?type=movie&amp;%s&amp;sort=recommend&amp;page_limit=20&amp;page_start=0'</span></span><br><span class="line">    <span class="comment"># 分类的所有信息</span></span><br><span class="line">    tags_list = get_tags(url)</span><br><span class="line">    <span class="comment"># 定义全局变量</span></span><br><span class="line">    <span class="keyword">global</span> movie_url_list</span><br><span class="line">    <span class="comment"># 定义所有的电影资源的url空列表</span></span><br><span class="line">    movie_url_list = []</span><br><span class="line"></span><br><span class="line">    <span class="keyword">for</span> tag <span class="keyword">in</span> tags_list[<span class="string">'tags'</span>]:</span><br><span class="line">        <span class="comment"># 循环遍历分类信息</span></span><br><span class="line">        data = &#123;<span class="string">'tag'</span>: tag&#125;</span><br><span class="line">        <span class="comment"># 组装新的url</span></span><br><span class="line">        m_url = movie_url % parse.urlencode(data)</span><br><span class="line">        movie_url_list.append(m_url)</span><br><span class="line">    </span><br><span 
class="line">    <span class="comment"># 启动多线程</span></span><br><span class="line">    <span class="keyword">while</span> <span class="keyword">True</span>:</span><br><span class="line">        <span class="comment"># 如果movie_url_list有值</span></span><br><span class="line">        <span class="keyword">if</span> movie_url_list:</span><br><span class="line">            a1 = GetMovie()</span><br><span class="line">            a2 = GetMovie()</span><br><span class="line"></span><br><span class="line">            a1.start()</span><br><span class="line">            a2.start()</span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            <span class="keyword">break</span></span><br></pre></td></tr></table></figure>

      
    </div>
    
    
    

    

    
      <div>
        <div style="padding: 10px 0; margin: 20px auto; width: 90%; text-align: center;">
  <div></div>
  <button id="rewardButton" type="button" onclick="var qr = document.getElementById('QR'); if (qr.style.display === 'none') {qr.style.display='block';} else {qr.style.display='none'}">
    <span>打赏</span>
  </button>
  <div id="QR" style="display: none;">

    
      <div id="wechat" style="display: inline-block">
        <img id="wechat_qr" src="/WechatIMG26.jpeg" alt="Mr. Lee 微信支付"/>
        <p>微信支付</p>
      </div>
    

    

    

  </div>
</div>

      </div>
    

    

    <div>
      
        <div>
    
        <div style="text-align:center;color: #ccc;font-size:14px;">
            -------------本文结束
            <i class="fa fa-paw"></i>
            感谢您的阅读-------------
        </div>
    
</div>
      
    </div>

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/tags/Spider基础/" rel="tag"><i class="fa fa-tag"></i> Spider基础</a>
          
        </div>
      

      
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/2018/06/30/Ajax写法/" rel="next" title="Ajax写法">
                <i class="fa fa-chevron-left"></i> Ajax写法
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
              <a href="/2018/06/30/mongodb常用命令/" rel="prev" title="mongodb常用命令">
                mongodb常用命令 <i class="fa fa-chevron-right"></i>
              </a>
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>



    <div class="post-spread">
      
    </div>
  </div>


          </div>
          


          

  



        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    
    <div class="sidebar-inner">

      

      
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
            文章目录
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview-wrap">
            站点概览
          </li>
        </ul>
      

      <section class="site-overview-wrap sidebar-panel">
        <div class="site-overview">
          <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
            
              <img class="site-author-image" itemprop="image"
                src="/avatar.png"
                alt="Mr. Lee" />
            
              <p class="site-author-name" itemprop="name">Mr. Lee</p>
              <p class="site-description motion-element" itemprop="description"></p>
          </div>

          <nav class="site-state motion-element">

            
              <div class="site-state-item site-state-posts">
              
                <a href="/archives/">
              
                  <span class="site-state-item-count">33</span>
                  <span class="site-state-item-name">日志</span>
                </a>
              </div>
            

            

            
              
              
              <div class="site-state-item site-state-tags">
                <a href="/tags/index.html">
                  <span class="site-state-item-count">30</span>
                  <span class="site-state-item-name">标签</span>
                </a>
              </div>
            

          </nav>

          

          
            <div class="links-of-author motion-element">
                
                  <span class="links-of-author-item">
                    <a href="https://github.com/Leezhonglin" target="_blank" title="GitHub">
                      
                        <i class="fa fa-fw fa-github"></i>GitHub</a>
                  </span>
                
                  <span class="links-of-author-item">
                    <a href="mailto:380604322@qq.com" target="_blank" title="E-Mail">
                      
                        <i class="fa fa-fw fa-envelope"></i>E-Mail</a>
                  </span>
                
            </div>
          

          
          

          
          
            <div class="links-of-blogroll motion-element links-of-blogroll-block">
              <div class="links-of-blogroll-title">
                <i class="fa  fa-fw fa-link"></i>
                友情链接
              </div>

              <ul class="links-of-blogroll-list">

                
                    <span class="links-of-author-item" style="text-align:center">
                      <a href="https://my.csdn.net/jackfrued/" title="大 神" target="_blank">
                        大 神
                      </a>
                    </span>
                
                    <span class="links-of-author-item" style="text-align:center">
                      <a href="https://blog.csdn.net/qq_33196814" title="CSDN" target="_blank">
                        CSDN
                      </a>
                    </span>
                
                    <span class="links-of-author-item" style="text-align:center">
                      <a href="http://www.python.org" title="python" target="_blank">
                        python
                      </a>
                    </span>
                
                    <span class="links-of-author-item" style="text-align:center">
                      <a href="http://redisdoc.com/" title="redis" target="_blank">
                        redis
                      </a>
                    </span>
                

              </ul>


            </div>
          

          

        </div>
      </section>

      
      <!--noindex-->
        <section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">

            
              
            

            
              <div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-4"><a class="nav-link" href="#Spider学习笔记"><span class="nav-number">1.</span> <span class="nav-text">Spider学习笔记</span></a><ol class="nav-child"><li class="nav-item nav-level-5"><a class="nav-link" href="#前言"><span class="nav-number">1.1.</span> <span class="nav-text">前言:</span></a></li><li class="nav-item nav-level-5"><a class="nav-link" href="#1-数据分析和采集"><span class="nav-number">1.2.</span> <span class="nav-text">1. 数据分析和采集</span></a><ol class="nav-child"><li class="nav-item nav-level-6"><a class="nav-link" href="#1-1-数据分析"><span class="nav-number">1.2.1.</span> <span class="nav-text">1.1 数据分析</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#1-2-数据采集"><span class="nav-number">1.2.2.</span> <span class="nav-text">1.2 数据采集</span></a></li></ol></li><li class="nav-item nav-level-5"><a class="nav-link" href="#2-请求头分析"><span class="nav-number">1.3.</span> <span class="nav-text">2. 请求头分析</span></a></li><li class="nav-item nav-level-5"><a class="nav-link" href="#3-urllib库的使用"><span class="nav-number">1.4.</span> <span class="nav-text">3.urllib库的使用</span></a><ol class="nav-child"><li class="nav-item nav-level-6"><a class="nav-link" href="#3-1-发起请求"><span class="nav-number">1.4.1.</span> <span class="nav-text">3.1 发起请求</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#3-2-简单抓取网页"><span class="nav-number">1.4.2.</span> <span class="nav-text">3.2 简单抓取网页</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#3-2-设置请求超时"><span class="nav-number">1.4.3.</span> <span class="nav-text">3.2 设置请求超时</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#3-3-ssl认证"><span class="nav-number">1.4.4.</span> <span class="nav-text">3.3 ssl认证</span></a></li></ol></li><li class="nav-item nav-level-5"><a class="nav-link" href="#4-使用urllib进行中文的编码和解码"><span class="nav-number">1.5.</span> <span class="nav-text">4. 
使用urllib进行中文的编码和解码</span></a></li><li class="nav-item nav-level-5"><a class="nav-link" href="#5-数据采集"><span class="nav-number">1.6.</span> <span class="nav-text">5. 数据采集</span></a><ol class="nav-child"><li class="nav-item nav-level-6"><a class="nav-link" href="#5-1-BeautifSoup库的使用"><span class="nav-number">1.6.1.</span> <span class="nav-text">5.1 BeautifSoup库的使用</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#5-2-解析语法、find、find-all"><span class="nav-number">1.6.2.</span> <span class="nav-text">5.2 解析语法、find、find_all</span></a></li></ol></li><li class="nav-item nav-level-5"><a class="nav-link" href="#6-requests库的使用"><span class="nav-number">1.7.</span> <span class="nav-text">6. requests库的使用</span></a><ol class="nav-child"><li class="nav-item nav-level-6"><a class="nav-link" href="#6-1安装"><span class="nav-number">1.7.1.</span> <span class="nav-text">6.1安装</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#6-2-发送请求，GET、POST、PUT、PATCH、DELETE"><span class="nav-number">1.7.2.</span> <span class="nav-text">6.2 发送请求，GET、POST、PUT、PATCH、DELETE</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#6-3-传递-URL-参数"><span class="nav-number">1.7.3.</span> <span class="nav-text">6.3 传递 URL 参数</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#6-4-响应内容"><span class="nav-number">1.7.4.</span> <span class="nav-text">6.4 响应内容</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#6-5二进制响应内容"><span class="nav-number">1.7.5.</span> <span class="nav-text">6.5二进制响应内容</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#6-6-JSON-响应内容"><span class="nav-number">1.7.6.</span> <span class="nav-text">6.6 JSON 响应内容</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#6-7-原始响应内容"><span class="nav-number">1.7.7.</span> <span class="nav-text">6.7 原始响应内容</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" 
href="#6-8-定制请求头"><span class="nav-number">1.7.8.</span> <span class="nav-text">6.8 定制请求头</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#"><span class="nav-number">1.7.9.</span> <span class="nav-text"> </span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#6-9-响应状态码"><span class="nav-number">1.7.10.</span> <span class="nav-text">6.9 响应状态码</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#6-10-响应头"><span class="nav-number">1.7.11.</span> <span class="nav-text">6.10 响应头</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#6-11-Cookie"><span class="nav-number">1.7.12.</span> <span class="nav-text">6.11 Cookie</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#6-12-重定向与请求历史"><span class="nav-number">1.7.13.</span> <span class="nav-text">6.12 重定向与请求历史</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#6-13-超时"><span class="nav-number">1.7.14.</span> <span class="nav-text">6.13 超时</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#错误与异常"><span class="nav-number">1.7.15.</span> <span class="nav-text">错误与异常</span></a></li></ol></li><li class="nav-item nav-level-5"><a class="nav-link" href="#7-同步异步"><span class="nav-number">1.8.</span> <span class="nav-text">7 . 同步异步</span></a><ol class="nav-child"><li class="nav-item nav-level-6"><a class="nav-link" href="#同步"><span class="nav-number">1.8.1.</span> <span class="nav-text">同步</span></a></li><li class="nav-item nav-level-6"><a class="nav-link" href="#异步"><span class="nav-number">1.8.2.</span> <span class="nav-text">异步</span></a></li></ol></li><li class="nav-item nav-level-5"><a class="nav-link" href="#8-阻塞非阻塞"><span class="nav-number">1.9.</span> <span class="nav-text">8. 阻塞非阻塞</span></a></li><li class="nav-item nav-level-5"><a class="nav-link" href="#9-同步和阻塞的区别"><span class="nav-number">1.10.</span> <span class="nav-text">9. 
同步和阻塞的区别</span></a></li><li class="nav-item nav-level-5"><a class="nav-link" href="#10-并发-并行"><span class="nav-number">1.11.</span> <span class="nav-text">10. 并发 并行</span></a></li></ol></li></ol></div>
            

          </div>
        </section>
      <!--/noindex-->
      

      

    </div>
  </aside>


        
      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="footer-inner">
        <script async src="//busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>
<div class="copyright">&copy; <span itemprop="copyrightYear">2019</span>
  <span class="with-love">
    <i class="fa fa-user"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">Mr. Lee</span>

  
</div>


  <span class="post-meta-divider">|</span>



  <div class="powered-by">由 <a class="theme-link" target="_blank" href="https://github.com/Leezhonglin">Leezhonglin</a> 提供技术支持</div>



  <span class="post-meta-divider">|</span>


<div class="powered-by">
<i class="fa fa-user-md"></i><span id="busuanzi_container_site_uv">
  本站访客数:<span id="busuanzi_value_site_uv"></span>
</span>


  <span class="post-meta-divider">|</span>


<span id="busuanzi_container_site_pv">
    本站总访问量:<span id="busuanzi_value_site_pv"></span>次
</span>


  <span class="post-meta-divider">|</span>


</div>


<div class="theme-info">
  <div class="powered-by"></div>
  <span class="post-count">博客全站共:64.5k字</span>
</div>


  <span class="post-meta-divider">|</span>




        







        
      </div>
    </footer>

    
      <div class="back-to-top">
        <i class="fa fa-arrow-up"></i>
        
      </div>
    

    

  </div>

  

<script type="text/javascript">
  // Keep window.Promise only when it is a real function: if its
  // Object.prototype.toString tag is anything other than '[object Function]'
  // (missing, or replaced by a non-function value), reset it to null.
  if (Object.prototype.toString.call(window.Promise) !== '[object Function]') {
    window.Promise = null;
  }
</script>









  












  
  
    <script type="text/javascript" src="/lib/jquery/index.js?v=2.1.3"></script>
  

  
  
    <script type="text/javascript" src="/lib/fastclick/lib/fastclick.min.js?v=1.0.6"></script>
  

  
  
    <script type="text/javascript" src="/lib/jquery_lazyload/jquery.lazyload.js?v=1.9.7"></script>
  

  
  
    <script type="text/javascript" src="/lib/velocity/velocity.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/lib/fancybox/source/jquery.fancybox.pack.js?v=2.1.5"></script>
  


  


  <script type="text/javascript" src="/js/src/utils.js?v=5.1.4"></script>

  <script type="text/javascript" src="/js/src/motion.js?v=5.1.4"></script>



  
  


  <script type="text/javascript" src="/js/src/affix.js?v=5.1.4"></script>

  <script type="text/javascript" src="/js/src/schemes/pisces.js?v=5.1.4"></script>



  
  <script type="text/javascript" src="/js/src/scrollspy.js?v=5.1.4"></script>
<script type="text/javascript" src="/js/src/post-details.js?v=5.1.4"></script>



  


  <script type="text/javascript" src="/js/src/bootstrap.js?v=5.1.4"></script>



  


  




	





  





  












  

  <script type="text/javascript">
    // State shared by the local-search popup code below.
    // isfetched stays false until the search database has been loaded once.
    var isfetched = false;
    // Search database file name; falls back to "search.xml" when empty.
    var search_path = "search.xml";
    if (!search_path.length) {
      search_path = "search.xml";
    }
    // The database is treated as XML unless its name ends in "json"
    // (case-insensitive).
    var isXml = !/json$/i.test(search_path);
    // Root-relative URL the database is requested from.
    var path = "/" + search_path;
    // monitor main search box;

    // Close the search popup: hide it, clear the query box, drop any rendered
    // results / placeholder / overlay, and restore the body's overflow style.
    var onPopupClose = function (e) {
      $('.popup').hide();
      $('#local-search-input').val('');
      $('.search-result-list, #no-result, .local-search-pop-overlay').remove();
      $('body').css('overflow', '');
    };

    // Show the search popup: add a dimming overlay (clicking it closes the
    // popup), lock body scrolling, toggle the popup and focus the query box.
    function proceedsearch() {
      var $body = $("body");
      $body.append('<div class="search-popup-overlay local-search-pop-overlay"></div>');
      $body.css('overflow', 'hidden');

      $('.search-popup-overlay').click(onPopupClose);
      $('.popup').toggle();

      // Turn off mobile auto-capitalisation / auto-correction for the query.
      $('#local-search-input')
        .attr({autocapitalize: "none", autocorrect: "off"})
        .focus();
    }

    // search function;
    /**
     * Load the local search database and wire the search box to it.
     *
     * Shows a loading overlay, requests `path` (parsed as XML or JSON depending
     * on the global `isXml`), and on success sets the global `isfetched`,
     * attaches the search handler to the input and opens the popup through
     * proceedsearch(). On failure the loading overlay is removed and body
     * scrolling is restored so the page stays usable.
     *
     * @param {string} path       URL of the search database.
     * @param {string} search_id  id of the search <input> element.
     * @param {string} content_id id of the element that receives the results.
     */
    var searchFunc = function(path, search_id, content_id) {
      'use strict';

      // Remove the loading overlay and undo the body scroll lock.
      function removeLoadingOverlay() {
        $(".local-search-pop-overlay").remove();
        $('body').css('overflow', '');
      }

      // Return every non-overlapping occurrence of `word` in `text` as
      // {position, word}; both are lower-cased first unless caseSensitive.
      function getIndexByWord(word, text, caseSensitive) {
        var wordLen = word.length;
        if (wordLen === 0) {
          return [];
        }
        var startPosition = 0, position = -1, index = [];
        if (!caseSensitive) {
          text = text.toLowerCase();
          word = word.toLowerCase();
        }
        while ((position = text.indexOf(word, startPosition)) > -1) {
          index.push({position: position, word: word});
          startPosition = position + wordLen;
        }
        return index;
      }

      // Render text[slice.start, slice.end) with every hit of the slice
      // wrapped in <b class="search-keyword">.
      function highlightKeyword(text, slice) {
        var result = '';
        var prevEnd = slice.start;
        slice.hits.forEach(function (hit) {
          result += text.substring(prevEnd, hit.position);
          var end = hit.position + hit.length;
          result += '<b class="search-keyword">' + text.substring(hit.position, end) + '</b>';
          prevEnd = end;
        });
        result += text.substring(prevEnd, slice.end);
        return result;
      }

      // start loading animation
      $("body")
        .append('<div class="search-popup-overlay local-search-pop-overlay">' +
          '<div id="search-loading-icon">' +
          '<i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>' +
          '</div>' +
          '</div>')
        .css('overflow', 'hidden');
      $("#search-loading-icon").css('margin', '20% auto 0 auto').css('text-align', 'center');

      $.ajax({
        url: path,
        dataType: isXml ? "xml" : "json",
        async: true,
        // Without this the overlay (which has no click handler of its own)
        // would stay on screen forever when the database cannot be loaded.
        error: function() {
          removeLoadingOverlay();
        },
        success: function(res) {
          // get the contents from search data
          isfetched = true;
          $('.popup').detach().appendTo('.header-inner');
          var datas = isXml ? $("entry", res).map(function() {
            return {
              title: $("title", this).text(),
              content: $("content",this).text(),
              url: $("url" , this).text()
            };
          }).get() : res;
          var input = document.getElementById(search_id);
          var resultContent = document.getElementById(content_id);
          var inputEventFunction = function() {
            var searchText = input.value.trim().toLowerCase();
            var keywords = searchText.split(/[\s\-]+/);
            // with several keywords, also search for the whole phrase
            if (keywords.length > 1) {
              keywords.push(searchText);
            }
            var resultItems = [];
            if (searchText.length > 0) {
              // perform local searching
              datas.forEach(function(data) {
                var isMatch = false;
                var hitCount = 0;
                var searchTextCount = 0;
                var title = data.title.trim();
                var titleInLowerCase = title.toLowerCase();
                // strip tags so only text is searched and displayed
                var content = data.content.trim().replace(/<[^>]+>/g,"");
                var contentInLowerCase = content.toLowerCase();
                var articleUrl = decodeURIComponent(data.url);
                var indexOfTitle = [];
                var indexOfContent = [];
                // only match articles with not empty titles
                if(title != '') {
                  keywords.forEach(function(keyword) {
                    indexOfTitle = indexOfTitle.concat(getIndexByWord(keyword, titleInLowerCase, false));
                    indexOfContent = indexOfContent.concat(getIndexByWord(keyword, contentInLowerCase, false));
                  });
                  if (indexOfTitle.length > 0 || indexOfContent.length > 0) {
                    isMatch = true;
                    hitCount = indexOfTitle.length + indexOfContent.length;
                  }
                }

                // show search results

                if (isMatch) {
                  // sort index by position of keyword
                  // (descending, so the earliest hit sits at the end and can
                  // be taken with pop())

                  [indexOfTitle, indexOfContent].forEach(function (index) {
                    index.sort(function (itemLeft, itemRight) {
                      if (itemRight.position !== itemLeft.position) {
                        return itemRight.position - itemLeft.position;
                      } else {
                        return itemLeft.word.length - itemRight.word.length;
                      }
                    });
                  });

                  // merge hits into slices

                  // Consume hits from the end of `index` that fit inside
                  // [start, end) and return them as one slice; hits that
                  // overlap an accepted hit are discarded.
                  function mergeIntoSlice(text, start, end, index) {
                    var item = index[index.length - 1];
                    var position = item.position;
                    var word = item.word;
                    var hits = [];
                    var searchTextCountInSlice = 0;
                    while (position + word.length <= end && index.length != 0) {
                      if (word === searchText) {
                        searchTextCountInSlice++;
                      }
                      hits.push({position: position, length: word.length});
                      var wordEnd = position + word.length;

                      // move to next position of hit

                      index.pop();
                      while (index.length != 0) {
                        item = index[index.length - 1];
                        position = item.position;
                        word = item.word;
                        if (wordEnd > position) {
                          index.pop();
                        } else {
                          break;
                        }
                      }
                    }
                    searchTextCount += searchTextCountInSlice;
                    return {
                      hits: hits,
                      start: start,
                      end: end,
                      searchTextCount: searchTextCountInSlice
                    };
                  }

                  var slicesOfTitle = [];
                  if (indexOfTitle.length != 0) {
                    slicesOfTitle.push(mergeIntoSlice(title, 0, title.length, indexOfTitle));
                  }

                  var slicesOfContent = [];
                  while (indexOfContent.length != 0) {
                    var item = indexOfContent[indexOfContent.length - 1];
                    var position = item.position;
                    var word = item.word;
                    // cut out 100 characters
                    var start = position - 20;
                    var end = position + 80;
                    if(start < 0){
                      start = 0;
                    }
                    if (end < position + word.length) {
                      end = position + word.length;
                    }
                    if(end > content.length){
                      end = content.length;
                    }
                    slicesOfContent.push(mergeIntoSlice(content, start, end, indexOfContent));
                  }

                  // sort slices in content by search text's count and hits' count

                  slicesOfContent.sort(function (sliceLeft, sliceRight) {
                    if (sliceLeft.searchTextCount !== sliceRight.searchTextCount) {
                      return sliceRight.searchTextCount - sliceLeft.searchTextCount;
                    } else if (sliceLeft.hits.length !== sliceRight.hits.length) {
                      return sliceRight.hits.length - sliceLeft.hits.length;
                    } else {
                      return sliceLeft.start - sliceRight.start;
                    }
                  });

                  // select top N slices in content

                  var upperBound = parseInt('1');
                  if (upperBound >= 0) {
                    slicesOfContent = slicesOfContent.slice(0, upperBound);
                  }

                  // highlight title and content

                  var resultItem = '';

                  if (slicesOfTitle.length != 0) {
                    resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + highlightKeyword(title, slicesOfTitle[0]) + "</a>";
                  } else {
                    resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + title + "</a>";
                  }

                  slicesOfContent.forEach(function (slice) {
                    resultItem += "<a href='" + articleUrl + "'>" +
                      "<p class=\"search-result\">" + highlightKeyword(content, slice) +
                      "...</p>" + "</a>";
                  });

                  resultItem += "</li>";
                  resultItems.push({
                    item: resultItem,
                    searchTextCount: searchTextCount,
                    hitCount: hitCount,
                    id: resultItems.length
                  });
                }
              });
            }
            if (keywords.length === 1 && keywords[0] === "") {
              // empty query: show the idle placeholder
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-search fa-5x"></i></div>';
            } else if (resultItems.length === 0) {
              // nothing matched
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-frown-o fa-5x"></i></div>';
            } else {
              // best first: whole-phrase hits, then total hits, then later entries
              resultItems.sort(function (resultLeft, resultRight) {
                if (resultLeft.searchTextCount !== resultRight.searchTextCount) {
                  return resultRight.searchTextCount - resultLeft.searchTextCount;
                } else if (resultLeft.hitCount !== resultRight.hitCount) {
                  return resultRight.hitCount - resultLeft.hitCount;
                } else {
                  return resultRight.id - resultLeft.id;
                }
              });
              var searchResultList = '<ul class=\"search-result-list\">';
              resultItems.forEach(function (result) {
                searchResultList += result.item;
              });
              searchResultList += "</ul>";
              resultContent.innerHTML = searchResultList;
            }
          };

          if ('auto' === 'auto') {
            // search as the user types
            input.addEventListener('input', inputEventFunction);
          } else {
            // search only on icon click or Enter
            $('.search-icon').click(inputEventFunction);
            input.addEventListener('keypress', function (event) {
              if (event.keyCode === 13) {
                inputEventFunction();
              }
            });
          }

          // remove loading animation
          removeLoadingOverlay();

          proceedsearch();
        }
      });
    };

    // handle and trigger popup window;
    // Clicking a trigger opens the popup; the first click loads the search
    // database through searchFunc(), later clicks just reopen the popup.
    $('.popup-trigger').on('click', function (evt) {
      evt.stopPropagation();
      if (isfetched !== false) {
        proceedsearch();
        return;
      }
      searchFunc(path, 'local-search-input', 'local-search-result');
    });

    // The close button dismisses the popup.
    $('.popup-btn-close').on('click', onPopupClose);

    // Clicks inside the popup do not propagate to ancestor handlers.
    $('.popup').on('click', function (evt) {
      evt.stopPropagation();
    });

    // Esc (key code 27) dismisses the popup while it is visible.
    $(document).on('keyup', function (evt) {
      if (evt.which !== 27) {
        return;
      }
      if ($('.search-popup').is(':visible')) {
        onPopupClose();
      }
    });
  </script>





  

  

  

  
  

  

  

  

</body>
</html>
